#!/usr/bin/env node

// This script generates the JSON files (unicode categories, properties, and their aliases)
// that are used by the CLI to handle unicode property escapes.

// Destinations of the generated JSON tables. The paths are relative, so they
// are resolved against the directory the script is run from.
const CATEGORY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-categories.json';
const PROPERTY_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-properties.json';
const CATEGORY_ALIAS_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-category-aliases.json';
const PROPERTY_ALIAS_OUTPUT_PATH = './cli/src/generate/prepare_grammar/unicode-property-aliases.json';

// Unicode Character Database files that the tables are built from.
// All of them are pinned to the same Unicode version.
const CATEGORY_URL = 'https://unicode.org/Public/13.0.0/ucd/UnicodeData.txt';
const PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/PropList.txt';
const DERIVED_PROPERTY_URL = 'https://unicode.org/Public/13.0.0/ucd/DerivedCoreProperties.txt';
const CATEGORY_ALIAS_URL = 'https://unicode.org/Public/13.0.0/ucd/PropertyValueAliases.txt';
const PROPERTY_ALIAS_URL = 'https://unicode.org/Public/13.0.0/ucd/PropertyAliases.txt';

const fs = require('fs');
const path = require('path');
const {spawnSync} = require('child_process');

// Fetch every unicode data file up front, in a fixed order. cachedDownload
// keeps a copy of each one inside the 'target' directory and reuses it when
// it is already there.
const [
    categoryData,
    propertyData,
    derivedPropertyData,
    categoryAliasData,
    propertyAliasData,
] = [
    CATEGORY_URL,
    PROPERTY_URL,
    DERIVED_PROPERTY_URL,
    CATEGORY_ALIAS_URL,
    PROPERTY_ALIAS_URL,
].map(url => cachedDownload(url));
/**
 * Return the contents of the file at `url`, downloading it with `curl` the
 * first time and reading it back from the 'target' directory afterwards.
 *
 * A download is only written to the cache when `curl` succeeded and produced
 * output, so a failed attempt is retried on the next run instead of leaving
 * an empty or partial file behind.
 *
 * @param {string} url - Location of the file; its basename names the cache entry.
 * @returns {string} The file contents, decoded as UTF-8.
 * @throws {Error} If `curl` cannot be run, exits unsuccessfully, or returns nothing.
 */
function cachedDownload(url) {
    const downloadPath = path.join('.', 'target', path.basename(url));
    if (fs.existsSync(downloadPath)) {
        return fs.readFileSync(downloadPath, 'utf8');
    }

    // --fail turns HTTP errors into a non-zero exit status (instead of an
    // error page on stdout), and --location follows redirects.
    const curlArgs = ['--fail', '--silent', '--show-error', '--location', url];
    const result = spawnSync('curl', curlArgs, {
        encoding: 'utf8',
        // spawnSync caps the captured output (see its `maxBuffer` option);
        // raise the cap well above the size of the files fetched here.
        maxBuffer: 64 * 1024 * 1024,
    });

    // `error` is set when the process could not be spawned or was terminated
    // by Node (e.g. the output cap was exceeded).
    if (result.error) {
        throw new Error(`Failed to run curl for ${url}: ${result.error.message}`);
    }
    if (result.status !== 0) {
        const details = (result.stderr || '').trim();
        throw new Error(`curl exited with status ${result.status} for ${url}: ${details}`);
    }
    if (!result.stdout) {
        throw new Error(`curl returned no data for ${url}`);
    }

    // The cache directory may not exist yet on a fresh checkout.
    fs.mkdirSync(path.dirname(downloadPath), {recursive: true});
    fs.writeFileSync(downloadPath, result.stdout, 'utf8');
    return result.stdout;
}

// Result tables, filled in by the parsing passes below and written out as
// JSON at the end of the script.
const categories = {};
const properties = {};
const categoryAliases = {};
const propertyAliases = {};

// Scratch variables shared by the parsing passes; each pass resets the ones
// it uses before reading them.
let data;
let row;
let lineStart;
let lineEnd;

// Parse the properties.
// Every data line has the form `<code point or range> ; <property> # <comment>`.
// Each line is examined on its own, so a malformed line is reported instead of
// silently borrowing a ';' or '#' from a later line.
data = propertyData + derivedPropertyData;
row = 0;
const CODE_POINT = /[0-9A-Fa-f]/;
for (const line of data.split('\n')) {
    // `row` counts lines of the concatenated text, starting at 1.
    row++;

    // Skip over blank and comment lines: data lines start with a hex digit.
    if (!CODE_POINT.test(line.charAt(0))) continue;

    // Parse the first two semicolon fields:
    // * code point or code point range
    // * property (terminated by the trailing comment)
    const codePointEnd = line.indexOf(';');
    const propertyEnd = line.indexOf('#', codePointEnd + 1);
    if (codePointEnd === -1 || propertyEnd === -1) {
        throw new Error(`Unexpected format on line ${row}`);
    }

    // Process ranges (separated by '..'); a single code point is treated as a
    // range that starts and ends on itself.
    const [first, last = first] = line.slice(0, codePointEnd).trim()
        .split('..')
        .map(p => parseInt(p, 16));
    if (Number.isNaN(first) || Number.isNaN(last) || last < first) {
        throw new Error(`Unexpected format on line ${row}`);
    }

    const property = line.slice(codePointEnd + 1, propertyEnd).trim();

    console.log([first, last], property);

    // Group the code points by their property.
    if (!properties[property]) {
        properties[property] = [];
    }
    const members = properties[property];
    for (let c = first; c <= last; c++) {
        members.push(c);
    }
}

// Parse the categories.
// Each line describes a single code point, except that large ranges are
// abbreviated as a pair of consecutive lines whose names look like
// `<..., First>` and `<..., Last>`; every code point between the two (inclusive)
// belongs to the category given on those lines.
data = categoryData;
row = 0;
{
    // Set after the `First` line of a range has been read, until the matching
    // `Last` line is seen.
    let openRange = null;

    for (const line of data.split('\n')) {
        row++;

        // Tolerate blank lines (including the empty piece after the final newline).
        if (line.trim() === '') continue;

        // Parse the first three semicolon-separated fields:
        // * code point (hexadecimal)
        // * name
        // * category
        // The category must itself be followed by a ';', hence four pieces.
        const fields = line.split(';');
        if (fields.length < 4) {
            throw new Error(`Unexpected format on line ${row}`);
        }

        const codePoint = parseInt(fields[0], 16);
        const name = fields[1];
        const category = fields[2];
        if (Number.isNaN(codePoint)) {
            throw new Error(`Unexpected format on line ${row}`);
        }

        console.log(codePoint, category, name);

        // Group the code points by their category.
        if (!categories[category]) {
            categories[category] = [];
        }
        const members = categories[category];

        if (openRange !== null) {
            // The line right after a `First` line must close that same range.
            if (
                !name.endsWith(', Last>') ||
                category !== openRange.category ||
                codePoint < openRange.first
            ) {
                throw new Error(`Unexpected format on line ${row}`);
            }
            // The first code point was already recorded; add the rest of the range.
            for (let c = openRange.first + 1; c <= codePoint; c++) {
                members.push(c);
            }
            openRange = null;
        } else {
            members.push(codePoint);
            if (name.endsWith(', First>')) {
                openRange = {first: codePoint, category};
            }
        }
    }

    // A range that was opened but never closed means the data is incomplete.
    if (openRange !== null) {
        throw new Error(`Unexpected format on line ${row}`);
    }
}

// Parse the category aliases.
// Each line is examined on its own, so field boundaries can never be taken
// from a later line, and text after '#' is never mistaken for a field.
data = categoryAliasData;
row = 0;
const IGNORE = /[#\s]/;
for (const line of data.split('\n')) {
    row++;

    // Skip over blank and comment lines (and lines that start with whitespace)
    if (line === '' || IGNORE.test(line.charAt(0))) continue;

    // Drop the trailing comment, then split into semicolon-separated fields:
    // * property value type
    // * short name
    // * long name
    // Other aliases may be listed in additional fields
    const commentStart = line.indexOf('#');
    const fields = (commentStart === -1 ? line : line.slice(0, commentStart))
        .split(';')
        .map(field => field.trim());
    if (fields.length < 3) {
        throw new Error(`Unexpected format on line ${row}`);
    }

    const [propertyValueType, shortName, ...aliases] = fields;

    // Filter for General_Category lines
    if (propertyValueType !== 'gc') continue;

    // The long name and every additional alias map to the short name.
    for (const alias of aliases) {
        console.log(alias, shortName);
        categoryAliases[alias] = shortName;
    }
}

// Parse the property aliases.
// Each line is examined on its own, so field boundaries can never be taken
// from a later line, and text after '#' is never mistaken for a field.
data = propertyAliasData;
row = 0;
for (const line of data.split('\n')) {
    row++;

    // Skip over blank and comment lines (and lines that start with whitespace)
    if (line === '' || IGNORE.test(line.charAt(0))) continue;

    // Drop the trailing comment, then split into semicolon-separated fields:
    // * short name
    // * long name
    // Other aliases may be listed in additional fields
    const commentStart = line.indexOf('#');
    const fields = (commentStart === -1 ? line : line.slice(0, commentStart))
        .split(';')
        .map(field => field.trim());
    if (fields.length < 2) {
        throw new Error(`Unexpected format on line ${row}`);
    }

    const [shortName, longName, ...otherAliases] = fields;

    // The short name and every additional alias map to the long name.
    for (const alias of [shortName, ...otherAliases]) {
        console.log(alias, longName);
        propertyAliases[alias] = longName;
    }
}

// Serialize each table to its JSON output file, in a fixed order.
for (const [outputPath, table] of [
    [CATEGORY_OUTPUT_PATH, categories],
    [PROPERTY_OUTPUT_PATH, properties],
    [CATEGORY_ALIAS_OUTPUT_PATH, categoryAliases],
    [PROPERTY_ALIAS_OUTPUT_PATH, propertyAliases],
]) {
    fs.writeFileSync(outputPath, JSON.stringify(table), 'utf8');
}
